setwd("~/Desktop/Coursework_files")
getwd()
## [1] "/Users/sangbin/Desktop/Coursework_files"

Data preparation

Library needed

library(vioplot)
## Loading required package: sm
## Package 'sm', version 2.2-5.7: type help(sm) for summary information
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(tidyr)
library(ggplot2) 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggstatsplot)
## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggpubr)
library(caret)
## Loading required package: lattice
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(superml)
## Loading required package: R6
library(caTools)
library(ranger)
library(rpart)
library(ROSE)
## Loaded ROSE 0.0-4

Data import and preparation

  • Read all the data we will be using.
airports <- read.csv("./dataverse_files/airports.csv", sep=",", header = TRUE)
carriers <- read.csv("./dataverse_files/carriers.csv", sep=",", header = TRUE)
planes <- read.csv("./dataverse_files/plane-data.csv", sep=",", header = TRUE)
variable_descriptions <- read.csv("/Users/sangbin/Desktop/Coursework_files/dataverse_files/variable-descriptions.csv",sep=",", header = TRUE)

df_03 <- read.csv("./dataverse_files/2003.csv.bz2", sep=",", header = TRUE)
df_04 <- read.csv("./dataverse_files/2004.csv.bz2", sep=",", header = TRUE)
df_05 <- read.csv("./dataverse_files/2005.csv.bz2", sep=",", header = TRUE)


  • Concatenate all the flight detail dataset into one dataset.
df <- rbind(df_03, df_04, df_05)


  • Create full date column
df$Date <- make_date(df$Year,df$Month,df$DayofMonth)


  • We are able to identify that if the flight is cancelled or diverted, there is no delay occurred. Hence we will filter out the cancelled and diverted flights for questions 1 to 4 just for now.


df_copy <- df
df_copy <- subset(df_copy, Cancelled == 0 & Diverted == 0)
df_copy <- subset(df_copy, select=-c(CancellationCode))


  • Make new columns to indicate delay of arrival and departure. 0 = on time , 1 = delayed
df_copy$DelayedArr <- 0
df_copy$DelayedDep <- 0


  • According to Bureau of Transportation statistics, a flight is counted as “on time” if the flight operated less than 15 minutes later than the scheduled time shown in the carriers’ Computerized Reservations Systems (CRS).Thus, On-time means a flight that arrives less than 15 minutes after its announced arrival time.
df_copy$DelayedArr[df_copy$ArrDelay > 15] <- 1
df_copy$DelayedDep[df_copy$DepDelay > 15] <- 1


  • Missing data (or Not a Number data) tells us that these features are not the cause of it. Hence convert NaN values into 0
df_copy$CarrierDelay <- df_copy$CarrierDelay %>% replace_na(0)
df_copy$WeatherDelay <- df_copy$WeatherDelay %>% replace_na(0)
df_copy$NASDelay <- df_copy$NASDelay %>% replace_na(0)
df_copy$SecurityDelay <- df_copy$SecurityDelay %>% replace_na(0)
df_copy$LateAircraftDelay <- df_copy$LateAircraftDelay %>% replace_na(0)


  • We can identify that there are no missing values except “ArrDelay” and “ActualElapsedTime”. Hence, drop na values
summary(df_copy)
##       Year          Month          DayofMonth      DayOfWeek        DepTime    
##  Min.   :2003   Min.   : 1.000   Min.   : 1.00   Min.   :1.000   Min.   :   1  
##  1st Qu.:2003   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000   1st Qu.: 936  
##  Median :2004   Median : 7.000   Median :16.00   Median :4.000   Median :1330  
##  Mean   :2004   Mean   : 6.524   Mean   :15.73   Mean   :3.942   Mean   :1345  
##  3rd Qu.:2005   3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:6.000   3rd Qu.:1733  
##  Max.   :2005   Max.   :12.000   Max.   :31.00   Max.   :7.000   Max.   :2805  
##                                                                                
##    CRSDepTime      ArrTime       CRSArrTime   UniqueCarrier        FlightNum   
##  Min.   :   0   Min.   :   1   Min.   :   0   Length:20356257    Min.   :   1  
##  1st Qu.: 935   1st Qu.:1120   1st Qu.:1125   Class :character   1st Qu.: 584  
##  Median :1327   Median :1522   Median :1525   Mode  :character   Median :1415  
##  Mean   :1339   Mean   :1497   Mean   :1504                      Mean   :2038  
##  3rd Qu.:1725   3rd Qu.:1915   3rd Qu.:1910                      3rd Qu.:2928  
##  Max.   :2400   Max.   :2955   Max.   :2400                      Max.   :9912  
##                                                                                
##    TailNum          ActualElapsedTime CRSElapsedTime    AirTime       
##  Length:20356257    Min.   :-710.0    Min.   : -32   Min.   :-3818.0  
##  Class :character   1st Qu.:  72.0    1st Qu.:  75   1st Qu.:   53.0  
##  Mode  :character   Median : 104.0    Median : 105   Median :   84.0  
##                     Mean   : 123.5    Mean   : 125   Mean   :  102.6  
##                     3rd Qu.: 154.0    3rd Qu.: 155   3rd Qu.:  134.0  
##                     Max.   :1849.0    Max.   :1441   Max.   : 3508.0  
##                     NA's   :1                                         
##     ArrDelay            DepDelay            Origin              Dest          
##  Min.   :-1302.000   Min.   :-1410.000   Length:20356257    Length:20356257   
##  1st Qu.:  -10.000   1st Qu.:   -4.000   Class :character   Class :character  
##  Median :   -2.000   Median :    0.000   Mode  :character   Mode  :character  
##  Mean   :    5.827   Mean   :    7.304                                        
##  3rd Qu.:   10.000   3rd Qu.:    5.000                                        
##  Max.   : 1925.000   Max.   : 1930.000                                        
##  NA's   :1                                                                    
##     Distance        TaxiIn            TaxiOut          Cancelled    Diverted
##  Min.   :   8   Min.   :   0.000   Min.   :   0.00   Min.   :0   Min.   :0  
##  1st Qu.: 308   1st Qu.:   4.000   1st Qu.:  10.00   1st Qu.:0   1st Qu.:0  
##  Median : 551   Median :   5.000   Median :  13.00   Median :0   Median :0  
##  Mean   : 719   Mean   :   7.561   Mean   :  15.69   Mean   :0   Mean   :0  
##  3rd Qu.: 948   3rd Qu.:   7.000   3rd Qu.:  18.00   3rd Qu.:0   3rd Qu.:0  
##  Max.   :4962   Max.   :1523.000   Max.   :3905.00   Max.   :0   Max.   :0  
##                                                                             
##   CarrierDelay       WeatherDelay          NASDelay        SecurityDelay     
##  Min.   :   0.000   Min.   :   0.0000   Min.   : -60.000   Min.   :  0.0000  
##  1st Qu.:   0.000   1st Qu.:   0.0000   1st Qu.:   0.000   1st Qu.:  0.0000  
##  Median :   0.000   Median :   0.0000   Median :   0.000   Median :  0.0000  
##  Mean   :   2.399   Mean   :   0.5766   Mean   :   2.966   Mean   :  0.0197  
##  3rd Qu.:   0.000   3rd Qu.:   0.0000   3rd Qu.:   0.000   3rd Qu.:  0.0000  
##  Max.   :1925.000   Max.   :1510.0000   Max.   :1385.000   Max.   :533.0000  
##                                                                              
##  LateAircraftDelay       Date              DelayedArr       DelayedDep    
##  Min.   :   0.000   Min.   :2003-01-01   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:   0.000   1st Qu.:2003-10-19   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :   0.000   Median :2004-07-19   Median :0.0000   Median :0.0000  
##  Mean   :   2.985   Mean   :2004-07-12   Mean   :0.1848   Mean   :0.1517  
##  3rd Qu.:   0.000   3rd Qu.:2005-04-11   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1407.000   Max.   :2005-12-31   Max.   :1.0000   Max.   :1.0000  
## 
df_copy <- df_copy %>% drop_na(ArrDelay)


  • convert concatenated hours and minutes into separate columns Hour
df_copy$DepHour = df_copy$DepTime %/% 100L
df_copy$ArrHour = df_copy$ArrTime %/% 100L
df_copy$CRSDepHour = df_copy$CRSDepTime %/% 100L
df_copy$CRSArrHour = df_copy$CRSArrTime %/% 100L


  • Convert Hour’s data type numeric to character.
df_copy$DepHour <- as.character(df_copy$DepHour)
df_copy$ArrHour <- as.character(df_copy$ArrHour)
df_copy$CRSDepHour <- as.character(df_copy$CRSDepHour)
df_copy$CRSArrHour <- as.character(df_copy$CRSArrHour)


  • Convert the time into 24-hour notation.
df_copy$DepHour = with(df_copy, ifelse(DepHour %in% c("24","0") , "00",
                                       ifelse(DepHour %in% c("25","1") , "01",
                                              ifelse(DepHour %in% c("26","2") , "02",
                                                     ifelse(DepHour %in% c("27","3") , "03",
                                                            ifelse(DepHour %in% c("28","4") , "04",
                                                                   ifelse(DepHour %in% c("29","5") , "05",
                                                                          ifelse(DepHour %in% c("6") , "06",
                                                                                 ifelse(DepHour %in% c("7") , "07",
                                                                                        ifelse(DepHour %in% c("8") , "08",
                                                                                               ifelse(DepHour %in% c("9") , "09",
                                                                                                      DepHour)))))))))))

df_copy$ArrHour = with(df_copy, ifelse(ArrHour %in% c("24","0") , "00",
                                       ifelse(ArrHour %in% c("25","1") , "01",
                                              ifelse(ArrHour %in% c("26","2") , "02",
                                                     ifelse(ArrHour %in% c("27","3") , "03",
                                                            ifelse(ArrHour %in% c("28","4") , "04",
                                                                   ifelse(ArrHour %in% c("29","5") , "05",
                                                                          ifelse(ArrHour %in% c("6") , "06",
                                                                                 ifelse(ArrHour %in% c("7") , "07",
                                                                                        ifelse(ArrHour %in% c("8") , "08",
                                                                                               ifelse(ArrHour %in% c("9") , "09",
                                                                                                      ArrHour)))))))))))

df_copy$CRSDepHour = with(df_copy, ifelse(CRSDepHour %in% c("24","0") , "00",
                                       ifelse(CRSDepHour %in% c("25","1") , "01",
                                              ifelse(CRSDepHour %in% c("26","2") , "02",
                                                     ifelse(CRSDepHour %in% c("27","3") , "03",
                                                            ifelse(CRSDepHour %in% c("28","4") , "04",
                                                                   ifelse(CRSDepHour %in% c("29","5") , "05",
                                                                          ifelse(CRSDepHour %in% c("6") , "06",
                                                                                 ifelse(CRSDepHour %in% c("7") , "07",
                                                                                        ifelse(CRSDepHour %in% c("8") , "08",
                                                                                               ifelse(CRSDepHour %in% c("9") , "09",
                                                                                                      CRSDepHour)))))))))))

df_copy$CRSArrHour = with(df_copy, ifelse(CRSArrHour %in% c("24","0") , "00",
                                       ifelse(CRSArrHour %in% c("25","1") , "01",
                                              ifelse(CRSArrHour %in% c("26","2") , "02",
                                                     ifelse(CRSArrHour %in% c("27","3") , "03",
                                                            ifelse(CRSArrHour %in% c("28","4") , "04",
                                                                   ifelse(CRSArrHour %in% c("29","5") , "05",
                                                                          ifelse(CRSArrHour %in% c("6") , "06",
                                                                                 ifelse(CRSArrHour %in% c("7") , "07",
                                                                                        ifelse(CRSArrHour %in% c("8") , "08",
                                                                                               ifelse(CRSArrHour %in% c("9") , "09",
                                                                                                      CRSArrHour)))))))))))


  • Groupby distance to see the relationship between delays
Distance_delay <- df_copy %>%
  group_by(Distance) %>%
  summarise(Average_ArrDelay = mean(ArrDelay))


  • Average arrival delay based on distance and Frequency of the flight over distance.
a<- ggplot(data = Distance_delay , mapping = aes(x = Distance, y = Average_ArrDelay)) +
  geom_line(stat='identity') +
  geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Distribution of arrival delay based on distance",x = "Distance", y ="Average ArrDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))

b<-ggplot(data = Distance_delay ,aes(x = Distance)) +
  geom_histogram(aes(y=..density..),
                 binwidth=100) +
  labs(title = "Frequency of the flight over distance",x = "Distance", y ="Frequency of the flight") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))+
  geom_density(alpha=.2, fill="white")

ggarrange(a,b, 
          labels = c("A", "B"),
          ncol = 2, nrow = 1)

  • We can observe that shorter distanced flights seem to have more fluctuation on delays compared to long distance flights.

  • We can also observe that there are more of shorter distanced flights compared to long distance flights.


1) When is the best time of day, day of the week, and time of year to fly to minimize delays?


Deep dive into Delayed arrival flights

Delayed_flights <- filter(df_copy,df_copy$DelayedArr == 1)


  • ArrDelay Boxplot(before)
c <- ggplot(data=Delayed_flights, aes(y=ArrDelay))+
  geom_boxplot(fill='slategrey',color='darkslategrey',width=0.3)+
  labs(title = "ArrDelay Boxplot(before)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))


  • DepDelay Boxplot(before)
d <- ggplot(data=Delayed_flights, aes(y=DepDelay))+
  geom_boxplot(fill='slategrey',color='darkslategrey',width=0.3)+
  labs(title = "DepDelay Boxplot(before)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))

ggarrange(c,d, 
          labels = c("C", "D"),
          ncol = 2, nrow = 1)


  • Identify first, third quartile and interquartile range to identify the lower and upper whisker
Q_ArrDelay <- quantile(Delayed_flights$ArrDelay, probs=c(.25, .75), na.rm = FALSE)
IQR_ArrDelay <- IQR(Delayed_flights$ArrDelay)
Upper_Whisker_ArrDelay <-  Q_ArrDelay[2]+1.5*IQR_ArrDelay
Lower_Whisker_ArrDelay <- Q_ArrDelay[1]-1.5*IQR_ArrDelay
Q_DepDelay <- quantile(Delayed_flights$DepDelay, probs=c(.25, .75), na.rm = FALSE)
IQR_DepDelay <- IQR(Delayed_flights$DepDelay)
Upper_Whisker_DepDelay <-  Q_DepDelay[2]+1.5*IQR_DepDelay
Lower_Whisker_DepDelay <- Q_DepDelay[1]-1.5*IQR_DepDelay


  • Outliers will be any points below Lower_Whisker or above Upper_Whisker hence sort them out from arrival delayed flights.
Delayed_flights<- subset(Delayed_flights, Delayed_flights$ArrDelay > Lower_Whisker_ArrDelay & 
                           Delayed_flights$ArrDelay < Upper_Whisker_ArrDelay &
                           Delayed_flights$DepDelay > Lower_Whisker_DepDelay & 
                           Delayed_flights$DepDelay < Upper_Whisker_DepDelay)


  • ArrDelay Boxplot (after)
e <- ggplot(data=Delayed_flights, aes(y=ArrDelay))+
  geom_boxplot(fill='slategrey',color='darkslategrey',width=0.3)+
  labs(title = "ArrDelay Boxplot(after)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))


  • DepDelay Boxplot (after)
f <- ggplot(data=Delayed_flights, aes(y=DepDelay))+
  geom_boxplot(fill='slategrey',color='darkslategrey',width=0.3)+
  labs(title = "DepDelay Boxplot(after)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))

ggarrange(e,f, 
          labels = c("E", "F"),
          ncol = 2, nrow = 1)


  • Total number of flights per hour
total_flights_per_DepHour <- df_copy %>%
  group_by(DepHour) %>%
  summarise(Count = sum(DelayedArr))

g <- ggplot(data = total_flights_per_DepHour, mapping = aes(x = reorder(DepHour,-Count), y = Count)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Count,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Total number of flights per hour",x = "Actual Departure Hour", y ="Number of flights") + 
  theme(plot.title = element_text(hjust = 0.5))


  • Number of delayed flights per hour
delayed_flights_per_DepHour <- Delayed_flights %>%
  group_by(DepHour) %>%
  summarise(Count = sum(DelayedArr))

h <- ggplot(data = delayed_flights_per_DepHour, mapping = aes(x = reorder(DepHour,-Count), y = Count)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Count,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Number of delayed flights per hour",x = "Actual Departure Hour", y ="Number of delayed flights") + 
  theme(plot.title = element_text(hjust = 0.5))

ggarrange(g,h, 
          labels = c("G", "H"),
          ncol = 2, nrow = 1)

  • We can identify that number of flights differ by hour.


  • Actual Departure Hour
Delayed_flights_DepHour <- Delayed_flights %>%
  group_by(DepHour) %>%
  summarise(Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay))

i <- ggplot(data = Delayed_flights_DepHour, mapping = aes(x = DepHour, y = Average_DepDelay)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Average_DepDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Distribution of delay over the hours day",x = "Actual Departure Hour", y ="Average DepDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5))

j <- ggplot(data = Delayed_flights_DepHour, mapping = aes(x = DepHour, y = Average_ArrDelay)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Distribution of delay over the hours day",x = "Actual Departure Hour", y ="Average ArrDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5))


  • CRSDeparture Hour
Delayed_flights_CRSDepHour <- Delayed_flights %>%
  group_by(CRSDepHour) %>%
  summarise(Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay))

k <- ggplot(data = Delayed_flights_CRSDepHour, mapping = aes(x = CRSDepHour, y = Average_DepDelay)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Average_DepDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Distribution of delay over the hours day",x = "CRS Departure Hour", y ="Average DepDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5))


l <- ggplot(data = Delayed_flights_CRSDepHour, mapping = aes(x = CRSDepHour, y = Average_ArrDelay)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Distribution of delay over the hours day",x = "CRS Departure Hour", y ="Average ArrDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5))

ggarrange(i,j,k,l, 
          labels = c("I", "J","K","L"),
          ncol = 2, nrow = 2)


  • Identify the best day of week to book a flight
Delayed_flights_Day <- Delayed_flights %>%
  group_by(DayOfWeek) %>%
  summarise(Count = sum(DelayedArr),Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay))

m <-ggplot(data = Delayed_flights_Day, mapping = aes(x = reorder(wday(DayOfWeek,label = TRUE),-Average_ArrDelay), y = Average_ArrDelay)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
   labs(title = "Average Arrival Delay by Day",x = "Day", y ="Average ArrDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5))


  • Number of Arrival Delay by Day
n <- ggplot(data = Delayed_flights_Day, mapping = aes(x = reorder(wday(DayOfWeek,label = TRUE),-Count), y =Count)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Count,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Number of Arrival Delay by Day",x = "Day", y ="Arrival Delay Count") + 
  theme(plot.title = element_text(hjust = 0.5))

ggarrange(m,n, 
          labels = c("M", "N"),
          ncol = 2, nrow = 1)


  • Identify the best day of month to book a flight
Delayed_flights_Month <- Delayed_flights %>%
  group_by(Month) %>%
  summarise(Count= sum(DelayedArr),Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay))

o <- ggplot(data = Delayed_flights_Month, mapping = aes(x = reorder(month(Month,label = TRUE),-Average_ArrDelay), y = Average_ArrDelay)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Average Arrival Delay by Month",x = "Month", y ="Average ArrDelay (min)")+
  theme(plot.title = element_text(hjust = 0.5)) 


  • Number of Arrival Delay by Month
p <- ggplot(data = Delayed_flights_Month, mapping = aes(x = reorder(month(Month,label = TRUE),-Count), y =Count)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Count,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Number of Arrival Delay by Day",x = "Month", y ="Arrival Delay Count") + 
  theme(plot.title = element_text(hjust = 0.5))

ggarrange(o,p, 
          labels = c("O", "P"),
          ncol = 2, nrow = 1)


*From the plots, we can identify the best time to minimize delays.


2) Do older planes suffer more delays?

plane_characteristic <- Delayed_flights 
  
plane_characteristic <- left_join(plane_characteristic, planes, by = c("TailNum"='tailnum'))%>%
  group_by(year) %>%
  summarise(Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay), Average_CarrierDelay = mean(CarrierDelay))


plane_characteristic <- plane_characteristic %>%
  filter(!is.na(year), plane_characteristic$year !="",plane_characteristic$year !="0000",plane_characteristic$year !="None",plane_characteristic$year !="2007")  



q<- ggplot(data = plane_characteristic, mapping = aes(x = year, y = Average_ArrDelay)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Distribution of delay over the engine year of the plane",x = "Engine Year", y ="Average ArrDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))


r<- ggplot(data = plane_characteristic, mapping = aes(x = year, y = Average_DepDelay)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Distribution of delay over the engine year of the plane",x = "Engine Year", y ="Average DepDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))


s<- ggplot(data = plane_characteristic, mapping = aes(x = year, y = Average_CarrierDelay)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Distribution of delay over the engine year of the plane",x = "Engine Year", y ="Average CarrierDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))

ggarrange(q,r,s, 
          labels = c("Q", "R", "S"),
          ncol = 2, nrow = 2)

3) How does the number of people flying between different locations change over time?


Destination <- df_copy
Destination$season <- month(Destination$Date,label = TRUE) 


Destination$trip <-paste(Destination$Origin,Destination$Dest,sep="-")


Destination$season = with(Destination, ifelse(season%in% c("Dec","Jan","Feb") , "Winter",
                                       ifelse(season %in% c("Mar","Apr","May") , "Spring",
                                              ifelse(season %in% c("Jun","Jul","Aug") , "Summer",
                                                     ifelse(season %in% c("Sep","Oct","Nov") , "Autumn",
                                                            season)))))


Destination <- left_join(Destination, airports, by = c("Dest"='iata'))


Destination_trip <- Destination %>%
  group_by(season,trip)%>%
  summarise(count = n())
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.
  Destination_trip %>% 
  arrange(desc(count)) %>%
  slice(1:10) %>%
  ggplot(., mapping = aes(x = reorder(trip, -count), y = count)) +
  facet_wrap(~season)+
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(count,0), fontface = 'bold', vjust = -0.2), size = 1.5) +
  labs(title = "Top 10 Most Frequent Trips",x = "Trip", y ="Total Trip Count") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))


Destination_airport <- Destination %>%
  group_by(season,airport)%>%
  summarise(count = n())
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.
Destination_airport %>% 
  arrange(desc(count)) %>%
  slice(1:10) %>%
  ggplot(., mapping = aes(x = reorder(airport, -count), y = count)) +
  facet_wrap(~season)+
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(count,0), fontface = 'bold', vjust = -0.2), size = 1.5) +
  labs(title = "Top 10 Most Frequent airports",x = "airport", y ="Total airport Count") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))


Destination_city <- Destination %>%
  group_by(season,city)%>%
  summarise(count = n())
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.
  Destination_city %>% 
  arrange(desc(count)) %>%
  slice(1:10) %>%
  ggplot(., mapping = aes(x = reorder(city, -count), y = count)) +
  facet_wrap(~season)+
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(count,0), fontface = 'bold', hjust = .5,vjust = 2), size = 2) +
  labs(title = "Top 10 Most Frequent citys",x = "city", y ="Total city Count") + 
  theme(plot.title = element_text(hjust = 0.5), # title
        axis.text.x=element_text(angle=90, hjust=1, size = 8), # x axis
        axis.text.y=element_text(hjust=1, size = 8), # y axis
        strip.text = element_text(size = 8))#facet text


Destination_state <- Destination %>%
  group_by(season,state)%>%
  summarise(count = n())
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.
  Destination_state %>% 
  arrange(desc(count)) %>%
  slice(1:10) %>%
  ggplot(., mapping = aes(x = reorder(state, -count), y = count)) +
  facet_wrap(~season)+
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(count,0), fontface = 'bold', hjust = .5,vjust = 2), size = 2) +
  labs(title = "Top 10 Most Frequent citys",x = "city", y ="Total city Count") + 
  theme(plot.title = element_text(hjust = 0.5), # title
        axis.text.x=element_text(angle=90, hjust=1, size = 8), # x axis
        axis.text.y=element_text(hjust=1, size = 8), # y axis
        strip.text = element_text(size = 8))#facet text

4) Can you detect cascading failures as delays in one airport create delays in others?



Airport <- filter(Destination,Destination$LateAircraftDelay > 0) 


Airport <- subset(Airport, Airport$DelayedArr == 1 & Airport$DelayedDep == 1)


Airport_cascading_failure <- subset(Airport, Airport$TailNum %in% c("N336","N337"))


no_cascading_failure <- Airport_cascading_failure %>%
        filter(Date =="2004-04-12" & FlightNum == 2967)%>%
        arrange(CRSDepHour)

t <- ggplot(data = no_cascading_failure, mapping = aes(x = CRSDepHour, y = LateAircraftDelay)) +
  geom_line(group=1) +
  geom_text(mapping = aes(label = LateAircraftDelay, fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "LateAircraftDelay on 2004-04-12 for tail N336",x = "CRSDepHour", y ="LateAircraftDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5))


cascading_failure <- Airport_cascading_failure %>%
  filter(Date =="2005-06-10" & TailNum == "N337")%>%
  arrange(CRSDepHour)

u <- ggplot(data = cascading_failure, mapping = aes(x = CRSDepHour, y = LateAircraftDelay)) +
  geom_line(group=1) +
  geom_text(mapping = aes(label = LateAircraftDelay, fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "LateAircraftDelay on 2005-06-10 for tail N337",x = "CRSDepHour", y ="LateAircraftDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5))

ggarrange(t,u, 
          labels = c("T", "U"),
          ncol = 2, nrow = 1)


Longest_carrier_delay_flights <- Airport %>%
  group_by(airport) %>%
  summarise(Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay), Average_CarrierDelay = mean(CarrierDelay))


v <- Longest_carrier_delay_flights %>% 
  arrange(desc(Average_DepDelay)) %>%
  slice(1:10) %>%
  ggplot(., mapping = aes(x = reorder(airport, -Average_DepDelay), y =Average_DepDelay)) +
  geom_bar(stat='identity',fill="steelblue")+
  geom_text(mapping = aes(label = round(Average_DepDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Top 10 airports with longest departure delay",x = "Airport", y ="Average DepDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))


w <- Longest_carrier_delay_flights %>% 
  arrange(desc(Average_ArrDelay)) %>%
  slice(1:10) %>%
  ggplot(., mapping = aes(x = reorder(airport, -Average_ArrDelay), y =Average_ArrDelay)) +
  geom_bar(stat='identity',fill="steelblue")+
  geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Top 10 airports with longest arrival delay",x = "Airport", y ="Average ArrDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))


x <- Longest_carrier_delay_flights %>% 
  arrange(desc(Average_CarrierDelay)) %>%
  slice(1:10) %>%
  ggplot(., mapping = aes(x = reorder(airport, -Average_CarrierDelay), y =Average_CarrierDelay)) +
  geom_bar(stat='identity',fill="steelblue")+
  geom_text(mapping = aes(label = round(Average_CarrierDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Top 10 airports with longest carrier delay",x = "Airport", y ="Average CarrierDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))

ggarrange(v,w,x,
          labels = c("V", "W", "X"),
          ncol = 2, nrow = 2)



Airline <- Delayed_flights 
Airline <- left_join(Airline, carriers, by = c("UniqueCarrier"='Code'))%>%
  group_by(UniqueCarrier) %>%
  summarise(Average_DepDelay = mean(DepDelay),Average_ArrDelay = mean(ArrDelay), Average_CarrierDelay = mean(CarrierDelay))


y <- ggplot(data = Airline, mapping = aes(x = UniqueCarrier, y = Average_ArrDelay)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Distribution of arrival delay over Airline",x = "Airline Code", y ="Average ArrDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))


z <- ggplot(data = Airline, mapping = aes(x = UniqueCarrier, y = Average_DepDelay)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Distribution of departure delay over Airline",x = "Airline Code", y ="Average DepDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))


aa<- ggplot(data = Airline, mapping = aes(x = UniqueCarrier, y = Average_CarrierDelay)) +
  geom_bar(stat='identity',fill="steelblue") +
  geom_text(mapping = aes(label = round(Average_ArrDelay,0), fontface = 'bold', vjust = -0.2), size = 2) +
  labs(title = "Distribution of carrier delay over Airline",x = "Airline Code", y ="Average CarrierDelay (min)") + 
  theme(plot.title = element_text(hjust = 0.5),axis.text.x=element_text(angle=90, hjust=1))

ggarrange(y,z,aa,
          labels = c("Y", "Z", "AA"),
          ncol = 2, nrow = 2)

5) Use the available variables to construct a model that predicts delays.


prediction <- df_copy %>% filter(row_number() %% 1000 == 0) %>%
  dplyr::select(c(DelayedArr,CRSDepHour,Origin,UniqueCarrier,Month,Distance))


prediction[prediction$DelayedArr == 0,]$DelayedArr <- "on-time"
prediction[prediction$DelayedArr == 1,]$DelayedArr <- "delayed"
prediction$DelayedArr <- as.factor(prediction$DelayedArr)


lbl <- LabelEncoder$new()

prediction$CRSDepHour <- lbl$fit_transform(prediction$CRSDepHour)
prediction$Origin <- lbl$fit_transform(prediction$Origin)
prediction$UniqueCarrier <- lbl$fit_transform(prediction$UniqueCarrier)


set.seed(1)
training_sample <- createDataPartition(prediction$DelayedArr,p = 0.8, list = FALSE)


dataset<- prediction[training_sample,]


validation <- prediction[-training_sample,]
output <- dataset[,1]

input <- dataset[,2:6]


par(mfrow=c(1,5))
for(i in 2:6) {vioplot(prediction[,i], main=names(input)[i-1])}


control <- trainControl(method="cv", number=10)
metric <- "Accuracy"


table(dataset$DelayedArr)
## 
## delayed on-time 
##    3041   13244


prop.table(table(dataset$DelayedArr))
## 
##   delayed   on-time 
## 0.1867363 0.8132637
par(mfrow=c(1, 1))
barplot(prop.table(table(dataset$DelayedArr)),
        ylim = c(0, 0.9),
        main = "Class Distribution")



set.seed(1)
undersampling <- ovun.sample(DelayedArr~., data=dataset, method = "under",N = 6082)$data
table(undersampling$DelayedArr)
## 
## on-time delayed 
##    3041    3041


set.seed(1)
oversampling <- ovun.sample(DelayedArr~., data=dataset, method = "over",N = 26488)$data
table(oversampling$DelayedArr)
## 
## on-time delayed 
##   13244   13244


set.seed(1)
bothsampling <- ovun.sample(DelayedArr~., data=dataset, method = "both",N = 16285, p=.5)$data
table(bothsampling$DelayedArr)
## 
## on-time delayed 
##    8189    8096


tree.over <- rpart(DelayedArr~., data=oversampling)
tree.under <- rpart(DelayedArr~., data=undersampling)
tree.both <- rpart(DelayedArr~., data=bothsampling)

pred.tree.over <- predict(tree.over, newdata = dataset)
pred.tree.under <- predict(tree.under, newdata = dataset)
pred.tree.both <- predict(tree.both, newdata = dataset)


roc.curve(dataset$DelayedArr, pred.tree.under[,2])

## Area under the curve (AUC): 0.580


roc.curve(dataset$DelayedArr, pred.tree.over[,2])

## Area under the curve (AUC): 0.550


roc.curve(dataset$DelayedArr, pred.tree.both[,2])

## Area under the curve (AUC): 0.579

Logistic Regression


  • Train
set.seed(1)
fit.glm_undersampling <- train(DelayedArr~., data=undersampling, method="glm", metric=metric, trControl=control, family="binomial")
set.seed(1)
fit.glm_oversampling <- train(DelayedArr~., data=oversampling, method="glm", metric=metric, trControl=control, family="binomial")
set.seed(1)
fit.glm_bothsampling <- train(DelayedArr~., data=bothsampling, method="glm", metric=metric, trControl=control, family="binomial")


  • Predict
predictions_glm_undersampling <- predict(fit.glm_undersampling, validation)
predictions_glm_oversampling <- predict(fit.glm_oversampling, validation)
predictions_glm_bothsampling <- predict(fit.glm_bothsampling, validation)


  • Confusion Matrix
confusionMatrix(predictions_glm_undersampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_glm_undersampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction delayed on-time
##    delayed     430    1545
##    on-time     330    1766
##                                          
##                Accuracy : 0.5394         
##                  95% CI : (0.524, 0.5548)
##     No Information Rate : 0.8133         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0614         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.5658         
##             Specificity : 0.5334         
##          Pos Pred Value : 0.2177         
##          Neg Pred Value : 0.8426         
##              Prevalence : 0.1867         
##          Detection Rate : 0.1056         
##    Detection Prevalence : 0.4851         
##       Balanced Accuracy : 0.5496         
##                                          
##        'Positive' Class : delayed        
## 
confusionMatrix(predictions_glm_oversampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_glm_oversampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction delayed on-time
##    delayed     429    1549
##    on-time     331    1762
##                                           
##                Accuracy : 0.5382          
##                  95% CI : (0.5227, 0.5536)
##     No Information Rate : 0.8133          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0598          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5645          
##             Specificity : 0.5322          
##          Pos Pred Value : 0.2169          
##          Neg Pred Value : 0.8419          
##              Prevalence : 0.1867          
##          Detection Rate : 0.1054          
##    Detection Prevalence : 0.4859          
##       Balanced Accuracy : 0.5483          
##                                           
##        'Positive' Class : delayed         
## 
confusionMatrix(predictions_glm_bothsampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_glm_bothsampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction delayed on-time
##    delayed     404    1483
##    on-time     356    1828
##                                           
##                Accuracy : 0.5483          
##                  95% CI : (0.5328, 0.5636)
##     No Information Rate : 0.8133          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0533          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.53158         
##             Specificity : 0.55210         
##          Pos Pred Value : 0.21410         
##          Neg Pred Value : 0.83700         
##              Prevalence : 0.18669         
##          Detection Rate : 0.09924         
##    Detection Prevalence : 0.46352         
##       Balanced Accuracy : 0.54184         
##                                           
##        'Positive' Class : delayed         
## 


  • Results
results <- resamples(list(under=fit.glm_undersampling, over=fit.glm_oversampling, both=fit.glm_bothsampling))

summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: under, over, both 
## Number of resamples: 10 
## 
## Accuracy 
##            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## under 0.5180921 0.5242599 0.5382124 0.5391318 0.5449690 0.5789474    0
## over  0.5311438 0.5405814 0.5502927 0.5475695 0.5545695 0.5590789    0
## both  0.5224064 0.5264128 0.5296283 0.5357062 0.5475057 0.5518723    0
## 
## Kappa 
##             Min.    1st Qu.     Median       Mean    3rd Qu.      Max. NA's
## under 0.03618421 0.04851974 0.07644376 0.07827149 0.08996921 0.1578947    0
## over  0.06226774 0.08116872 0.10057775 0.09513666 0.10913897 0.1181582    0
## both  0.04466697 0.05264292 0.05907156 0.07131276 0.09503272 0.1035835    0

Random Forest


  • Train
set.seed(1)
fit.rf_undersampling <- train(DelayedArr~., data=undersampling, method="rf", metric=metric, trControl=control)
set.seed(1)
fit.rf_oversampling <- train(DelayedArr~., data=oversampling, method="rf", metric=metric, trControl=control)
set.seed(1)
fit.rf_bothsampling <- train(DelayedArr~., data=bothsampling, method="rf", metric=metric, trControl=control)


  • Predict
predictions_rf_undersampling <- predict(fit.rf_undersampling, validation)
predictions_rf_oversampling <- predict(fit.rf_oversampling, validation)
predictions_rf_bothsampling <- predict(fit.rf_bothsampling, validation)


  • Confusion Matrix
confusionMatrix(predictions_rf_undersampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_rf_undersampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction delayed on-time
##    delayed     422    1510
##    on-time     338    1801
##                                           
##                Accuracy : 0.5461          
##                  95% CI : (0.5306, 0.5614)
##     No Information Rate : 0.8133          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0622          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5553          
##             Specificity : 0.5439          
##          Pos Pred Value : 0.2184          
##          Neg Pred Value : 0.8420          
##              Prevalence : 0.1867          
##          Detection Rate : 0.1037          
##    Detection Prevalence : 0.4746          
##       Balanced Accuracy : 0.5496          
##                                           
##        'Positive' Class : delayed         
## 
confusionMatrix(predictions_rf_oversampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_rf_oversampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction delayed on-time
##    delayed     130     360
##    on-time     630    2951
##                                           
##                Accuracy : 0.7568          
##                  95% CI : (0.7433, 0.7699)
##     No Information Rate : 0.8133          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0722          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.17105         
##             Specificity : 0.89127         
##          Pos Pred Value : 0.26531         
##          Neg Pred Value : 0.82407         
##              Prevalence : 0.18669         
##          Detection Rate : 0.03193         
##    Detection Prevalence : 0.12036         
##       Balanced Accuracy : 0.53116         
##                                           
##        'Positive' Class : delayed         
## 
confusionMatrix(predictions_rf_bothsampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_rf_bothsampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction delayed on-time
##    delayed     242     725
##    on-time     518    2586
##                                           
##                Accuracy : 0.6947          
##                  95% CI : (0.6803, 0.7088)
##     No Information Rate : 0.8133          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.09            
##                                           
##  Mcnemar's Test P-Value : 5.129e-09       
##                                           
##             Sensitivity : 0.31842         
##             Specificity : 0.78103         
##          Pos Pred Value : 0.25026         
##          Neg Pred Value : 0.83312         
##              Prevalence : 0.18669         
##          Detection Rate : 0.05944         
##    Detection Prevalence : 0.23753         
##       Balanced Accuracy : 0.54973         
##                                           
##        'Positive' Class : delayed         
## 


  • Results
results <- resamples(list(under=fit.rf_undersampling, over=fit.rf_oversampling, both=fit.rf_bothsampling))

summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: under, over, both 
## Number of resamples: 10 
## 
## Accuracy 
##            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## under 0.5197368 0.5431743 0.5542763 0.5527817 0.5676138 0.5773026    0
## over  0.9233384 0.9283692 0.9305265 0.9317041 0.9367686 0.9384906    0
## both  0.8919583 0.8955774 0.8989867 0.8997853 0.9046811 0.9097052    0
## 
## Kappa 
##             Min.    1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## under 0.03947368 0.08634868 0.1085526 0.1055776 0.1352482 0.1546053    0
## over  0.84667674 0.85674166 0.8610554 0.8634082 0.8735329 0.8769811    0
## both  0.78398025 0.79120678 0.7980411 0.7996424 0.8094338 0.8194768    0

CART


  • Train
set.seed(1)
fit.cart_undersampling <- train(DelayedArr~., data=undersampling, method="rpart", metric=metric, trControl=control)
set.seed(1)
fit.cart_oversampling <- train(DelayedArr~., data=oversampling, method="rpart", metric=metric, trControl=control)
set.seed(1)
fit.cart_bothsampling <- train(DelayedArr~., data=bothsampling, method="rpart", metric=metric, trControl=control)


  • Predict
predictions_cart_undersampling <- predict(fit.cart_undersampling, validation)
predictions_cart_oversampling <- predict(fit.cart_oversampling, validation)
predictions_cart_bothsampling <- predict(fit.cart_bothsampling, validation)


  • Confusion Matrix
confusionMatrix(predictions_cart_undersampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_cart_undersampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction delayed on-time
##    delayed     440    1579
##    on-time     320    1732
##                                           
##                Accuracy : 0.5335          
##                  95% CI : (0.5181, 0.5489)
##     No Information Rate : 0.8133          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0623          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5789          
##             Specificity : 0.5231          
##          Pos Pred Value : 0.2179          
##          Neg Pred Value : 0.8441          
##              Prevalence : 0.1867          
##          Detection Rate : 0.1081          
##    Detection Prevalence : 0.4959          
##       Balanced Accuracy : 0.5510          
##                                           
##        'Positive' Class : delayed         
## 
confusionMatrix(predictions_cart_oversampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_cart_oversampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction delayed on-time
##    delayed     507    1782
##    on-time     253    1529
##                                           
##                Accuracy : 0.5001          
##                  95% CI : (0.4846, 0.5156)
##     No Information Rate : 0.8133          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0726          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6671          
##             Specificity : 0.4618          
##          Pos Pred Value : 0.2215          
##          Neg Pred Value : 0.8580          
##              Prevalence : 0.1867          
##          Detection Rate : 0.1245          
##    Detection Prevalence : 0.5623          
##       Balanced Accuracy : 0.5644          
##                                           
##        'Positive' Class : delayed         
## 
confusionMatrix(predictions_cart_bothsampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_cart_bothsampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction delayed on-time
##    delayed     469    1758
##    on-time     291    1553
##                                           
##                Accuracy : 0.4967          
##                  95% CI : (0.4812, 0.5122)
##     No Information Rate : 0.8133          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0494          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6171          
##             Specificity : 0.4690          
##          Pos Pred Value : 0.2106          
##          Neg Pred Value : 0.8422          
##              Prevalence : 0.1867          
##          Detection Rate : 0.1152          
##    Detection Prevalence : 0.5470          
##       Balanced Accuracy : 0.5431          
##                                           
##        'Positive' Class : delayed         
## 


  • Results
results <- resamples(list(under=fit.cart_undersampling, over=fit.cart_oversampling, both=fit.cart_bothsampling))

summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: under, over, both 
## Number of resamples: 10 
## 
## Accuracy 
##            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## under 0.5328947 0.5439967 0.5603961 0.5552424 0.5675557 0.5690789    0
## over  0.5677614 0.5733838 0.5755804 0.5765629 0.5810683 0.5858815    0
## both  0.5307125 0.5419226 0.5551129 0.5522230 0.5604665 0.5690608    0
## 
## Kappa 
##             Min.    1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## under 0.06578947 0.08799342 0.1209316 0.1104634 0.1347427 0.1381579    0
## over  0.13558603 0.14670775 0.1511140 0.1531257 0.1621915 0.1718376    0
## both  0.06120970 0.08352119 0.1107704 0.1050697 0.1221803 0.1401949    0

KNN


  • Train
set.seed(1)
fit.knn_undersampling <- train(DelayedArr~., data=undersampling, method="knn", metric=metric, trControl=control)
set.seed(1)
fit.knn_oversampling <- train(DelayedArr~., data=oversampling, method="knn", metric=metric, trControl=control)
set.seed(1)
fit.knn_bothsampling <- train(DelayedArr~., data=bothsampling, method="knn", metric=metric, trControl=control)


  • Predict
predictions_knn_undersampling <- predict(fit.knn_undersampling, validation)
predictions_knn_oversampling <- predict(fit.knn_oversampling, validation)
predictions_knn_bothsampling <- predict(fit.knn_bothsampling, validation)


  • Confusion Matrix
confusionMatrix(predictions_knn_undersampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_knn_undersampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction delayed on-time
##    delayed     397    1619
##    on-time     363    1692
##                                           
##                Accuracy : 0.5131          
##                  95% CI : (0.4977, 0.5286)
##     No Information Rate : 0.8133          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0204          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.52237         
##             Specificity : 0.51102         
##          Pos Pred Value : 0.19692         
##          Neg Pred Value : 0.82336         
##              Prevalence : 0.18669         
##          Detection Rate : 0.09752         
##    Detection Prevalence : 0.49521         
##       Balanced Accuracy : 0.51670         
##                                           
##        'Positive' Class : delayed         
## 
confusionMatrix(predictions_knn_oversampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_knn_oversampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction delayed on-time
##    delayed     404    1654
##    on-time     356    1657
##                                           
##                Accuracy : 0.5063          
##                  95% CI : (0.4908, 0.5217)
##     No Information Rate : 0.8133          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0193          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.53158         
##             Specificity : 0.50045         
##          Pos Pred Value : 0.19631         
##          Neg Pred Value : 0.82315         
##              Prevalence : 0.18669         
##          Detection Rate : 0.09924         
##    Detection Prevalence : 0.50553         
##       Balanced Accuracy : 0.51602         
##                                           
##        'Positive' Class : delayed         
## 
confusionMatrix(predictions_knn_bothsampling, validation$DelayedArr)
## Warning in confusionMatrix.default(predictions_knn_bothsampling,
## validation$DelayedArr): Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction delayed on-time
##    delayed     405    1556
##    on-time     355    1755
##                                          
##                Accuracy : 0.5306         
##                  95% CI : (0.5151, 0.546)
##     No Information Rate : 0.8133         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0391         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.53289        
##             Specificity : 0.53005        
##          Pos Pred Value : 0.20653        
##          Neg Pred Value : 0.83175        
##              Prevalence : 0.18669        
##          Detection Rate : 0.09948        
##    Detection Prevalence : 0.48170        
##       Balanced Accuracy : 0.53147        
##                                          
##        'Positive' Class : delayed        
## 


  • Results
results <- resamples(list(under=fit.knn_undersampling, over=fit.knn_oversampling, both=fit.knn_bothsampling))

summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: under, over, both 
## Number of resamples: 10 
## 
## Accuracy 
##            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## under 0.4851974 0.5102796 0.5160220 0.5152923 0.5246711 0.5312500    0
## over  0.6752266 0.6806342 0.6849157 0.6854415 0.6884968 0.7006418    0
## both  0.6277641 0.6513198 0.6562308 0.6561267 0.6647727 0.6793612    0
## 
## Kappa 
##              Min.    1st Qu.     Median       Mean    3rd Qu.      Max. NA's
## under -0.02960526 0.02055921 0.03204792 0.03058133 0.04934211 0.0625000    0
## over   0.35045317 0.36136788 0.36978608 0.37088544 0.37692435 0.4012027    0
## both   0.25648767 0.30351370 0.31327833 0.31305003 0.33037773 0.3593412    0


  • Total Result
results_total <- resamples(list(under_glm=fit.glm_undersampling, over_glm=fit.glm_oversampling, both_glm=fit.glm_bothsampling,
under_rf=fit.rf_undersampling, over_rf=fit.rf_oversampling, both_rf=fit.rf_bothsampling,
under_cart=fit.cart_undersampling, over_cart=fit.cart_oversampling, both_cart=fit.cart_bothsampling,
under_knn=fit.knn_undersampling, over_knn=fit.knn_oversampling, both_knn=fit.knn_bothsampling))

summary(results_total)
## 
## Call:
## summary.resamples(object = results_total)
## 
## Models: under_glm, over_glm, both_glm, under_rf, over_rf, both_rf, under_cart, over_cart, both_cart, under_knn, over_knn, both_knn 
## Number of resamples: 10 
## 
## Accuracy 
##                 Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## under_glm  0.5180921 0.5242599 0.5382124 0.5391318 0.5449690 0.5789474    0
## over_glm   0.5311438 0.5405814 0.5502927 0.5475695 0.5545695 0.5590789    0
## both_glm   0.5224064 0.5264128 0.5296283 0.5357062 0.5475057 0.5518723    0
## under_rf   0.5197368 0.5431743 0.5542763 0.5527817 0.5676138 0.5773026    0
## over_rf    0.9233384 0.9283692 0.9305265 0.9317041 0.9367686 0.9384906    0
## both_rf    0.8919583 0.8955774 0.8989867 0.8997853 0.9046811 0.9097052    0
## under_cart 0.5328947 0.5439967 0.5603961 0.5552424 0.5675557 0.5690789    0
## over_cart  0.5677614 0.5733838 0.5755804 0.5765629 0.5810683 0.5858815    0
## both_cart  0.5307125 0.5419226 0.5551129 0.5522230 0.5604665 0.5690608    0
## under_knn  0.4851974 0.5102796 0.5160220 0.5152923 0.5246711 0.5312500    0
## over_knn   0.6752266 0.6806342 0.6849157 0.6854415 0.6884968 0.7006418    0
## both_knn   0.6277641 0.6513198 0.6562308 0.6561267 0.6647727 0.6793612    0
## 
## Kappa 
##                   Min.    1st Qu.     Median       Mean    3rd Qu.      Max.
## under_glm   0.03618421 0.04851974 0.07644376 0.07827149 0.08996921 0.1578947
## over_glm    0.06226774 0.08116872 0.10057775 0.09513666 0.10913897 0.1181582
## both_glm    0.04466697 0.05264292 0.05907156 0.07131276 0.09503272 0.1035835
## under_rf    0.03947368 0.08634868 0.10855263 0.10557763 0.13524822 0.1546053
## over_rf     0.84667674 0.85674166 0.86105538 0.86340824 0.87353293 0.8769811
## both_rf     0.78398025 0.79120678 0.79804109 0.79964241 0.80943378 0.8194768
## under_cart  0.06578947 0.08799342 0.12093158 0.11046342 0.13474274 0.1381579
## over_cart   0.13558603 0.14670775 0.15111395 0.15312575 0.16219145 0.1718376
## both_cart   0.06120970 0.08352119 0.11077038 0.10506973 0.12218029 0.1401949
## under_knn  -0.02960526 0.02055921 0.03204792 0.03058133 0.04934211 0.0625000
## over_knn    0.35045317 0.36136788 0.36978608 0.37088544 0.37692435 0.4012027
## both_knn    0.25648767 0.30351370 0.31327833 0.31305003 0.33037773 0.3593412
##            NA's
## under_glm     0
## over_glm      0
## both_glm      0
## under_rf      0
## over_rf       0
## both_rf       0
## under_cart    0
## over_cart     0
## both_cart     0
## under_knn     0
## over_knn      0
## both_knn      0
  • Among different models, we could pick the best model with appropriate value of accuracy, sensitivity and specificity which are a good indicators to evaluate the model.However, for this research, we will deep dive into Random Forest.

Tuning Random forest in R

  • We will be using undersampling result to tune the model
fit.rf_undersampling
## Random Forest 
## 
## 6082 samples
##    5 predictor
##    2 classes: 'on-time', 'delayed' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 5474, 5474, 5474, 5473, 5474, 5474, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa     
##   2     0.5498204  0.09964762
##   3     0.5513017  0.10261402
##   5     0.5527817  0.10557763
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
  • It shows that mtry of 5 has the highest accuracy for fit.rf_undersampling

  • Grid Search

control_grid <- trainControl(method="cv", number=10 , search="grid")

set.seed(1)

fit.rf_undersampling_grid <- train(DelayedArr~., data=undersampling, method="rf", metric=metric, trControl=control_grid)

print(fit.rf_undersampling_grid)
## Random Forest 
## 
## 6082 samples
##    5 predictor
##    2 classes: 'on-time', 'delayed' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 5474, 5474, 5474, 5473, 5474, 5474, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa     
##   2     0.5498204  0.09964762
##   3     0.5513017  0.10261402
##   5     0.5527817  0.10557763
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
plot(fit.rf_undersampling_grid)

  • Grid-search takes quite long time to load, hence random search is often used instead. Although it is done within certain amount of time, it has systematic way to find the appropriate model instead of just putting random values several times. Moreover, it is also called coarse search because it proceeds two to three times and gradually narrows down.

  • RandomSearch

control_random <- trainControl(method="cv", number=10, search="random")

set.seed(1)

fit.rf_undersampling_random <- train(DelayedArr~., data=undersampling, method="rf", metric=metric, trControl=control_random)

print(fit.rf_undersampling_random)
## Random Forest 
## 
## 6082 samples
##    5 predictor
##    2 classes: 'on-time', 'delayed' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 5474, 5474, 5474, 5473, 5474, 5474, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa     
##   1     0.5753079  0.15061929
##   2     0.5555724  0.11114867
##   5     0.5494928  0.09899791
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 1.
plot(fit.rf_undersampling_random)


  • Furthermore, to identify the importance of each variable, we will be using permutation based feature importance method to identify.

random forest model

rf_permutation <- ranger(DelayedArr~ ., dataset, importance = 'permutation')


  • barplot
rf_permutation_importance <- data.frame(
  var = names(rf_permutation$variable.importance), imp = c(rf_permutation$variable.importance))
rf_permutation_importance$data_type <- 'permutation'

ggplot(rf_permutation_importance) + 
  geom_bar(aes(x=reorder(var, imp), y=imp), stat = 'identity',fill="steelblue") +
  labs(title = "Importance of variables",x = "Variables", y ="Importance") + 
  theme(plot.title = element_text(hjust = 0.5))

  • From the graph, we can identify that “CRSDepHour”,“Distance” and “Origin” have high influence to the model.Thus for further research, we could optimize the best parameter for get the best result of the model.